How to Resolve IP Blocking?
- Modify request headers to mimic browser behavior (instead of direct code access).
- Use and rotate proxies.
- Space out requests: add a delay between consecutive accesses so the traffic looks less automated.
- Acquire proxy IP addresses.
- Find (or pay for) a proxy-list website to study; this article scrapes xicidaili.com.
Code Implementation
The following code extracts proxy IP addresses from the HTML elements with class="odd":
from bs4 import BeautifulSoup
import requests
import time
def open_proxy_url(url):
    """Download *url* with a browser-like User-Agent and return its text.

    Args:
        url: Address of the page to fetch.

    Returns:
        The decoded response body, or ``None`` when the request fails
        (connection problem, timeout, or an HTTP error status).
    """
    user_agent = 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/81.0.4044.113 Safari/537.36'
    headers = {'User-Agent': user_agent}
    try:
        r = requests.get(url, headers=headers, timeout=20)
        r.raise_for_status()
        # Use the encoding detected from the body rather than the one
        # declared in the headers before decoding to text.
        r.encoding = r.apparent_encoding
        return r.text
    except requests.RequestException:
        # Only request failures are treated as "page unavailable"; anything
        # else (e.g. Ctrl-C) is allowed to propagate.
        print(f'Unable to access webpage: {url}')
        return None
def get_proxy_ip(response):
    """Extract proxy URLs from the elements marked ``class="odd"``.

    Args:
        response: HTML text of the proxy-list page.

    Returns:
        A list of ``'<protocol>://<ip>:<port>'`` strings, one for each
        matching row whose protocol cell reads ``HTTP`` or ``HTTPS``.
    """
    proxy_ip_list = []
    soup = BeautifulSoup(response, 'html.parser')
    for row in soup.select('.odd'):  # Select elements with class="odd"
        cells = row.select('td')  # query the row once, not once per field
        if len(cells) < 6:
            # Index 5 is read below; skip rows that are too short instead
            # of raising IndexError.
            continue
        # Assumes cells 1, 2 and 5 hold IP, port and protocol, as the
        # scraped page lays them out -- confirm if the page changes.
        ip = cells[1].text.strip()
        port = cells[2].text.strip()
        protocol = cells[5].text.strip()
        if protocol in ('HTTP', 'HTTPS'):
            proxy_ip_list.append(f'{protocol}://{ip}:{port}')
    return proxy_ip_list
if __name__ == '__main__':
    # Fetch the proxy-list page, keep a copy on disk, then parse the copy.
    proxy_url = 'https://www.xicidaili.com/'
    proxy_ip_filename = 'proxy_ip.txt'

    text = open_proxy_url(proxy_url)
    if text is None:
        # The fetch failed (a message was already printed); there is
        # nothing to save or parse, so stop instead of crashing in write().
        raise SystemExit(1)

    # Explicit UTF-8 so the write/read round trip does not depend on the
    # platform's default encoding.
    with open(proxy_ip_filename, 'w', encoding='utf-8') as f:
        f.write(text)
    # Read the saved copy back inside a context manager so the handle is
    # closed deterministically.
    with open(proxy_ip_filename, 'r', encoding='utf-8') as f:
        text = f.read()

    proxy_ip_list = get_proxy_ip(text)
    print(proxy_ip_list)
Issue with Missing Data
Some proxy IPs are not captured because their rows lack class="odd". Modify the parser to include all <tr> tags under the element with id="ip_list":
def get_proxy_ip(response):
    """Extract proxy URLs from every row of the ``id="ip_list"`` element.

    Args:
        response: HTML text of the proxy-list page.

    Returns:
        A list of ``'<protocol>://<ip>:<port>'`` strings with the protocol
        lower-cased. Empty when the page has no ``id="ip_list"`` element.
    """
    proxy_ip_list = []
    soup = BeautifulSoup(response, 'html.parser')
    table = soup.find(id='ip_list')
    if table is None:
        # Page layout changed or an error page was served: nothing to parse.
        return proxy_ip_list
    for row in table.find_all('tr'):
        cells = row.select('td')  # query the row once, not once per field
        if len(cells) < 8:
            # Rows with fewer than 8 <td> cells (e.g. a header row built
            # from <th>) are not proxy entries.
            continue
        # Assumes cells 1, 2 and 5 hold IP, port and protocol, as the
        # scraped page lays them out -- confirm if the page changes.
        ip = cells[1].text.strip()
        port = cells[2].text.strip()
        protocol = cells[5].text.strip().lower()  # Normalize protocol
        if protocol in ('http', 'https'):
            proxy_ip_list.append(f'{protocol}://{ip}:{port}')
    return proxy_ip_list
Using Proxies
Proxies are passed to the requests call as a dictionary, via its proxies argument:
def open_url_using_proxy(url, proxy):
    """Fetch *url* through *proxy* with a browser-like User-Agent.

    Args:
        url: Address of the page to fetch.
        proxy: Proxy URL such as ``'http://10.10.1.10:3128'``; the scheme
            may be upper- or lower-case.

    Returns:
        ``(text, status_code)`` on success, ``False`` when the request
        fails (messages naming the URL and the proxy are printed).
    """
    user_agent = 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/81.0.4044.113 Safari/537.36'
    headers = {'User-Agent': user_agent}

    # URL schemes are case-insensitive; hand the library the lower-case form.
    scheme, sep, rest = proxy.partition('://')
    proxy_url = f'{scheme.lower()}://{rest}' if sep else proxy

    # requests chooses a proxy by the scheme of the *target* URL, not of the
    # proxy itself. Registering the proxy under both keys guarantees it is
    # used for this request instead of being silently bypassed.
    proxies = {'http': proxy_url, 'https': proxy_url}

    try:
        r = requests.get(url, headers=headers, proxies=proxies, timeout=10)
        r.raise_for_status()
        r.encoding = r.apparent_encoding
        return (r.text, r.status_code)
    except (requests.RequestException, ValueError):
        # ValueError is included so that a malformed proxy string rejected
        # during URL parsing is reported as an invalid proxy, not a crash.
        print(f'Unable to access webpage: {url}')
        print(f'Invalid proxy IP: {proxy}')
        return False
Proxy Validation
Verify proxy effectiveness by checking status codes and page titles (e.g., Baidu):
def check_proxy_avaliability(proxy, url='http://www.baidu.com',
                             expected_title='Baidu - Search'):
    """Report whether *proxy* can fetch a known page correctly.

    The proxy counts as valid when the page is fetched with status 200 and
    its ``<title>`` equals *expected_title*.

    Args:
        proxy: Proxy URL to test.
        url: Page fetched through the proxy.
        expected_title: Title the fetched page must carry.
            NOTE(review): confirm the default matches what the live page
            actually serves; a localized page may use a different title.

    Returns:
        ``True`` if the proxy passed the check, ``False`` otherwise. The
        verdict is also printed.
    """
    result = open_url_using_proxy(url, proxy)
    if result:
        text, status_code = result
        if status_code == 200:
            title_tag = BeautifulSoup(text, 'html.parser').find('title')
            # A proxy may return a page with no <title> at all (error or
            # injected content); treat that as a failed check.
            if title_tag is not None and title_tag.text.strip() == expected_title:
                print(f'Valid proxy IP: {proxy}')
                return True
    print(f'Invalid proxy IP: {proxy}')
    return False
HTTP vs. HTTPS Proxies
- HTTP proxies handle HTTP requests only.
- HTTPS proxies handle HTTPS requests.
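- Example proxy dictionary (each key is the scheme of the target URL; each value is the proxy used for requests with that scheme):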
# Example mapping for the `proxies` argument of requests:
# key = scheme of the URL being requested, value = proxy to route it through.
proxies = {
    'http': 'http://10.10.1.10:3128',    # used for http:// targets
    'https': 'https://10.10.1.11:1080',  # used for https:// targets
}
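Use an IP-echo service such as JSON IP to validate proxies: when a request is sent through a working proxy, the IP address the service reports should be the proxy's rather than your own.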
References
- Requests Proxies Documentation: requests.readthedocs.io
- BeautifulSoup Documentation: www.crummy.com/software/BeautifulSoup/bs4/doc/